{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fit WMF (weighted matrix factorization) to the binarized ML20M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import itertools\n",
    "import os\n",
    "import sys\n",
    "os.environ['OPENBLAS_NUM_THREADS'] = '1'\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy import sparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import content_wmf\n",
    "import batched_inv_joblib\n",
    "import rec_eval"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load pre-processed data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Change this to wherever you saved the pre-processed data following [this notebook](./preprocess_ML20M.ipynb)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "DATA_DIR = '/hdd2/dawen/data/ml-20m/pro/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "unique_uid = list()\n",
    "with open(os.path.join(DATA_DIR, 'unique_uid.txt'), 'r') as f:\n",
    "    for line in f:\n",
    "        unique_uid.append(line.strip())\n",
    "    \n",
    "unique_sid = list()\n",
    "with open(os.path.join(DATA_DIR, 'unique_sid.txt'), 'r') as f:\n",
    "    for line in f:\n",
    "        unique_sid.append(line.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "111148 11711\n"
     ]
    }
   ],
   "source": [
    "n_items = len(unique_sid)\n",
    "n_users = len(unique_uid)\n",
    "\n",
    "print n_users, n_items"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def load_data(csv_file, shape=(n_users, n_items)):\n",
    "    tp = pd.read_csv(csv_file)\n",
    "    timestamps, rows, cols = np.array(tp['timestamp']), np.array(tp['uid']), np.array(tp['sid'])\n",
    "    seq = np.concatenate((rows[:, None], cols[:, None], np.ones((rows.size, 1), dtype='int'), timestamps[:, None]), axis=1)\n",
    "    data = sparse.csr_matrix((np.ones_like(rows), (rows, cols)), dtype=np.int16, shape=shape)\n",
    "    return data, seq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "train_data, train_raw = load_data(os.path.join(DATA_DIR, 'train.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vad_data, vad_raw = load_data(os.path.join(DATA_DIR, 'validation.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "num_factors = 100\n",
    "num_iters = 50\n",
    "batch_size = 1000\n",
    "\n",
    "n_jobs = 4\n",
    "lam_theta = lam_beta = 1e-5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "best_ndcg = -np.inf\n",
    "U_best = None\n",
    "V_best = None\n",
    "best_alpha = 0\n",
    "\n",
    "for alpha in [2, 5, 10, 30, 50]: \n",
    "    S = content_wmf.linear_surplus_confidence_matrix(train_data, alpha=alpha)\n",
    "\n",
    "    U, V, vad_ndcg = content_wmf.factorize(S, num_factors, vad_data=vad_data, num_iters=num_iters, \n",
    "                                           init_std=0.01, lambda_U_reg=lam_theta, lambda_V_reg=lam_beta, \n",
    "                                           dtype='float32', random_state=98765, verbose=False, \n",
    "                                           recompute_factors=batched_inv_joblib.recompute_factors_batched, \n",
    "                                           batch_size=batch_size, n_jobs=n_jobs)\n",
    "    if vad_ndcg > best_ndcg:\n",
    "        best_ndcg = vad_ndcg\n",
    "        U_best = U.copy()\n",
    "        V_best = V.copy()\n",
    "        best_alpha = alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 0.35510611042\n"
     ]
    }
   ],
   "source": [
    "print best_alpha, best_ndcg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "test_data, _ = load_data(os.path.join(DATA_DIR, 'test.csv'))\n",
    "test_data.data = np.ones_like(test_data.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Recall@20: 0.1333\n",
      "Test Recall@50: 0.1647\n",
      "Test NDCG@100: 0.1602\n",
      "Test MAP@100: 0.0473\n"
     ]
    }
   ],
   "source": [
    "# alpha = 10 gives the best validation performance\n",
    "print 'Test Recall@20: %.4f' % rec_eval.recall_at_k(train_data, test_data, U_best, V_best, k=20, vad_data=vad_data)\n",
    "print 'Test Recall@50: %.4f' % rec_eval.recall_at_k(train_data, test_data, U_best, V_best, k=50, vad_data=vad_data)\n",
    "print 'Test NDCG@100: %.4f' % rec_eval.normalized_dcg_at_k(train_data, test_data, U_best, V_best, k=100, vad_data=vad_data)\n",
    "print 'Test MAP@100: %.4f' % rec_eval.map_at_k(train_data, test_data, U_best, V_best, k=100, vad_data=vad_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "np.savez('WMF_K100_ML20M.npz', U=U_best, V=V_best)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
